library(ggplot2)
library(e1071)
library(class)
library(tm)
library(tidyr)
library(plyr)
library(jsonlite)
library(dplyr)
library(tidyverse)
library(e1071)
library(mvtnorm)
library(caret)
library(mapproj)
library(plotly)
library(usmap)
library(maps)
library(scales)
library(tidymodels)
library(kknn)

#1. How many breweries are present in each state?

######Import Data#######
Beer = read.csv(file.choose(),header = TRUE) 
Breweries = read.csv(file.choose(),header = TRUE)

######Count#######
Count <- Breweries %>% group_by(State) %>% summarise(Brew_ID = n())
Count
## # A tibble: 51 × 2
##    State Brew_ID
##    <chr>   <int>
##  1 " AK"       7
##  2 " AL"       3
##  3 " AR"       2
##  4 " AZ"      11
##  5 " CA"      39
##  6 " CO"      47
##  7 " CT"       8
##  8 " DC"       1
##  9 " DE"       2
## 10 " FL"      15
## # … with 41 more rows
######Plot of Count by State#######
Breweries %>% ggplot(aes(x = State, fill=State)) + geom_bar() + ggtitle("Count of Breweries in each State") + ylab("Number of Breweries") + geom_text(aes(label = ..count..), stat = "count", vjust=-.2, colour = "black") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

#2. Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the merged file. (RMD only, this does not need to be included in the presentation or the deck.)

######Merge Data#######
Beer <- Beer %>% rename(Brew_ID = Brewery_id)
Beer <- Beer %>% rename(Beer_Name = Name)
Breweries <- Breweries %>% rename(Brewery_Name = Name)
Brewery_Beer <- merge(Beer,Breweries, by = "Brew_ID")

######Print Head and Tail#######
head(Brewery_Beer,6)
##   Brew_ID     Beer_Name Beer_ID   ABV IBU                               Style
## 1       1  Get Together    2692 0.045  50                        American IPA
## 2       1 Maggie's Leap    2691 0.049  26                  Milk / Sweet Stout
## 3       1    Wall's End    2690 0.048  19                   English Brown Ale
## 4       1       Pumpion    2689 0.060  38                         Pumpkin Ale
## 5       1    Stronghold    2688 0.060  25                     American Porter
## 6       1   Parapet ESB    2687 0.056  47 Extra Special / Strong Bitter (ESB)
##   Ounces       Brewery_Name        City State
## 1     16 NorthGate Brewing  Minneapolis    MN
## 2     16 NorthGate Brewing  Minneapolis    MN
## 3     16 NorthGate Brewing  Minneapolis    MN
## 4     16 NorthGate Brewing  Minneapolis    MN
## 5     16 NorthGate Brewing  Minneapolis    MN
## 6     16 NorthGate Brewing  Minneapolis    MN
tail(Brewery_Beer,6)
##      Brew_ID                 Beer_Name Beer_ID   ABV IBU
## 2405     556             Pilsner Ukiah      98 0.055  NA
## 2406     557  Heinnieweisse Weissebier      52 0.049  NA
## 2407     557           Snapperhead IPA      51 0.068  NA
## 2408     557         Moo Thunder Stout      50 0.049  NA
## 2409     557         Porkslap Pale Ale      49 0.043  NA
## 2410     558 Urban Wilderness Pale Ale      30 0.049  NA
##                        Style Ounces                  Brewery_Name          City
## 2405         German Pilsener     12         Ukiah Brewing Company         Ukiah
## 2406              Hefeweizen     12       Butternuts Beer and Ale Garrattsville
## 2407            American IPA     12       Butternuts Beer and Ale Garrattsville
## 2408      Milk / Sweet Stout     12       Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA)     12       Butternuts Beer and Ale Garrattsville
## 2410        English Pale Ale     12 Sleeping Lady Brewing Company     Anchorage
##      State
## 2405    CA
## 2406    NY
## 2407    NY
## 2408    NY
## 2409    NY
## 2410    AK

#3. Address the missing values in each column.

colSums(is.na(Brewery_Beer))
##      Brew_ID    Beer_Name      Beer_ID          ABV          IBU        Style 
##            0            0            0           62         1005            0 
##       Ounces Brewery_Name         City        State 
##            0            0            0            0
nrow(Brewery_Beer)
## [1] 2410
BB_NoNA = na.omit(Brewery_Beer)
colSums(is.na(BB_NoNA))
##      Brew_ID    Beer_Name      Beer_ID          ABV          IBU        Style 
##            0            0            0            0            0            0 
##       Ounces Brewery_Name         City        State 
##            0            0            0            0
nrow(BB_NoNA)
## [1] 1405

#4. Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare.

##########Median ABV and IBU for each State#############
ST_ABV <- aggregate(x= BB_NoNA$ABV, by = list(BB_NoNA$State),FUN = median)
ST_IBU <- aggregate(x= BB_NoNA$IBU, by = list(BB_NoNA$State),FUN = median)
ST_ABV <- ST_ABV %>% rename(Med_ABV = x,State = Group.1)
ST_IBU <- ST_IBU %>% rename(Med_IBU = x,State = Group.1)
######Turn ABV into a %#######
ST_ABV$Med_ABV_Rounded <- round(ST_ABV$Med_ABV ,digit=4)
ST_ABV$Med_ABVPCT <- percent(ST_ABV$Med_ABV_Rounded, accuracy = .01)
######Round INB for Scaling######
ST_IBU$Med_IBU_Rounded <- round(ST_IBU$Med_IBU ,digit=2)
######Plot ABV######
ST_ABV$State <- fct_reorder(ST_ABV$State, ST_ABV$Med_ABV)
ST_ABV %>% ggplot(aes(x = State, y=Med_ABVPCT, fill=State)) + geom_bar(stat="identity") + ggtitle("Med_ABV by State") + ylab("Med_ABV") + geom_text(aes(label = Med_ABVPCT), vjust=0, size= 2, colour = "black")

######Plot IBU######
ST_IBU$State <- fct_reorder(ST_IBU$State, ST_IBU$Med_IBU)
ST_IBU %>% ggplot(aes(x = State, y=Med_IBU_Rounded, fill=State)) + geom_bar(stat="identity") + ggtitle("Med_IBU by State") + ylab("Med_IBU") + geom_text(aes(label = Med_IBU_Rounded), vjust=0, size=2, colour = "black")

######Merge for a cross report######
ST_IBU_ABV <- merge(ST_IBU,ST_ABV, by = "State")
ST_IBU_ABV %>% ggplot(aes(x = Med_ABVPCT, y = Med_IBU_Rounded, color = State)) + geom_jitter() + ggtitle("Med_IBU and Mean_ABV") + xlab("Med_ABV") + ylab("Med_IBU")

######Create Bins for reporting######
ABVPCTFact = cut(ST_IBU_ABV$Med_ABV, breaks = c(.01,.05,.06,.07), labels = c("Low","Medium","High"))
######Plot the cross report######
ST_IBU_ABV %>%  mutate(ABVPCTFact = ABVPCTFact) %>%  ggplot(aes(x = Med_ABVPCT, y = Med_IBU_Rounded, color = ABVPCTFact)) + geom_jitter() + ggtitle("Med_IBU and Med_ABV") + xlab("Med_ABV") + ylab("Med_IBU")

#5. Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer?

Brewery_Beer %>% slice_max(ABV)
##   Brew_ID                                            Beer_Name Beer_ID   ABV
## 1      52 Lee Hill Series Vol. 5 - Belgian Style Quadrupel Ale    2565 0.128
##   IBU            Style Ounces            Brewery_Name    City State
## 1  NA Quadrupel (Quad)   19.2 Upslope Brewing Company Boulder    CO
Brewery_Beer %>% slice_min(ABV)
##   Brew_ID   Beer_Name Beer_ID   ABV IBU            Style Ounces
## 1     523 Scotty K NA     606 0.001  NA Low Alcohol Beer     16
##       Brewery_Name       City State
## 1 Uncommon Brewers Santa Cruz    CA

#6. Comment on the summary statistics and distribution of the ABV variable.

summary(BB_NoNA)
##     Brew_ID       Beer_Name            Beer_ID          ABV         
##  Min.   :  1.0   Length:1405        Min.   :   1   Min.   :0.02700  
##  1st Qu.: 95.0   Class :character   1st Qu.: 772   1st Qu.:0.05000  
##  Median :198.0   Mode  :character   Median :1439   Median :0.05700  
##  Mean   :224.2                      Mean   :1415   Mean   :0.05991  
##  3rd Qu.:351.0                      3rd Qu.:2069   3rd Qu.:0.06800  
##  Max.   :547.0                      Max.   :2692   Max.   :0.12500  
##       IBU            Style               Ounces      Brewery_Name      
##  Min.   :  4.00   Length:1405        Min.   : 8.40   Length:1405       
##  1st Qu.: 21.00   Class :character   1st Qu.:12.00   Class :character  
##  Median : 35.00   Mode  :character   Median :12.00   Mode  :character  
##  Mean   : 42.71                      Mean   :13.51                     
##  3rd Qu.: 64.00                      3rd Qu.:16.00                     
##  Max.   :138.00                      Max.   :32.00                     
##      City              State          
##  Length:1405        Length:1405       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
hist(BB_NoNA$ABV)

ABVLevel = cut(BB_NoNA$ABV, breaks = c(.00,.049,.089,.13), labels = c("Low","Medium","High"))
BB_NoNA %>% mutate(ABVLevel= ABVLevel) %>% ggplot(aes(x = ABV, y= IBU, fill=ABVLevel)) + geom_histogram(stat="identity") + ggtitle("Distribution by ABV Level")
## Warning: Ignoring unknown parameters: binwidth, bins, pad

#7. Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot. Make your best judgment of a relationship and EXPLAIN your answer.

BB_NoNA$ABV_Rounded <- round(BB_NoNA$ABV ,digit=3)

BB_NoNA %>% mutate(ABVLevel= ABVLevel) %>%  ggplot(aes(x = IBU, y = ABV, color = ABVLevel)) + geom_jitter() + ggtitle("ABV vs. IBU by Levels") + xlab("International Bitterness Unit") + ylab("Alcohol By Volume")

#8. Budweiser would also like to investigate the difference with respect to IBU and ABV between IPAs (India Pale Ales) and other types of Ale (any beer with “Ale” in its name other than IPA). You decide to use KNN classification to investigate this relationship. Provide statistical evidence one way or the other. You can of course assume your audience is comfortable with percentages … KNN is very easy to understand conceptually. In addition, while you have decided to use KNN to investigate this relationship (KNN is required) you may also feel free to supplement your response to this question with any other methods or techniques you have learned. Creativity and alternative solutions are always encouraged.

## First, we set NA for blank values as a catch-all so that when we retrieve IPA and Ale's we do not run across the error of pulling null values.
Brewery_Beer$Class1 <- NA
for(i in 1:nrow(Brewery_Beer)){
  if(grepl("IPA",Brewery_Beer$Style[i]) == TRUE | grepl("India Pale Ale", Brewery_Beer$Style[i]) == TRUE){
    Brewery_Beer$Class1[i] <- "IPA"
  } else if(grepl("Ale", Brewery_Beer$Style[i]) == TRUE & !grepl("India Pale Ale", Brewery_Beer$Style[i]) == TRUE & !grepl("IPA", Brewery_Beer$Style[i]) == TRUE){
    Brewery_Beer$Class1[i] <- "Ale"
  } else{
    Brewery_Beer$Class1[i] <- NA
  }
}
#In this section, we run our KNN test with a randomly selected seed and the original data set with Null values for ABV and IBU still included. We decided to use the original set with null values as that contained a greater set of names than the removed NAs data set.
set.seed(200)
Brewery_BeerKnn <- na.omit(Brewery_Beer)
splitPerc <- 0.80
trainIndices <- sample(1:dim(Brewery_BeerKnn)[1], round(splitPerc * dim(Brewery_BeerKnn)[1]))
train <- Brewery_BeerKnn[trainIndices,]
test <- Brewery_BeerKnn[-trainIndices,]
train$ABV <- scale(train$ABV)
train$IBU <- scale(train$IBU)
test$ABV <- scale(test$ABV)
test$IBU <- scale(test$IBU)
accuracy <- c()
k <- c()
for(i in 1:90){
  classifications <- knn(train[, c(4, 5)], test[, c(4, 5)], as.factor(train$Class1), prob = TRUE, k = i)
  CM <- confusionMatrix(table(classifications, as.factor(test$Class1)))
  accuracy[i] <- CM$overall[1]
  k[i] <- i
  if(i == 1){
    max_k <- i
    accuracy_max <- accuracy[i]
  }else if(accuracy[i] >= accuracy_max){
    max_k <- i
    accuracy_max <- accuracy[i]
  }
}
plot(k, accuracy, type = "l", xlab = "k")

print(paste0("Max k:", max_k))
## [1] "Max k:36"
print(paste0("Max accuracy:", accuracy[max_k]))
## [1] "Max accuracy:0.894179894179894"
classifications <- knn(train[, c(4, 5)], test[, c(4, 5)], train$Class1, prob = TRUE, k = max_k)
  CM <- confusionMatrix(table(classifications, test$Class1))
  print(CM)
## Confusion Matrix and Statistics
## 
##                
## classifications Ale IPA
##             Ale 108  10
##             IPA  10  61
##                                           
##                Accuracy : 0.8942          
##                  95% CI : (0.8413, 0.9342)
##     No Information Rate : 0.6243          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7744          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9153          
##             Specificity : 0.8592          
##          Pos Pred Value : 0.9153          
##          Neg Pred Value : 0.8592          
##              Prevalence : 0.6243          
##          Detection Rate : 0.5714          
##    Detection Prevalence : 0.6243          
##       Balanced Accuracy : 0.8872          
##                                           
##        'Positive' Class : Ale             
## 
library(plotly)
x_test <- test %>% select("ABV", "IBU")
y_test <- test %>% select("Class1")
yscore <- knn(train[, c(4, 5)], test[, c(4, 5)], train$Class1, prob = TRUE, k = max_k)
yscore <- attributes(yscore)$prob

pdb <- cbind(x_test, y_test)
pdb <- cbind(pdb, yscore)

fig <- plot_ly(data = pdb,x = ~IBU, y = ~ABV, type = 'scatter', mode = 'markers',color = ~yscore, colors = 'RdBu', symbol = ~Class1, split = ~Class1, symbols = c('square-dot','circle-dot'), marker = list(size = 12, line = list(color = 'black', width = 1)))
fig
ggplot(train,aes(x=IBU,y=ABV,colour=Class1))+geom_point(size=0.3)+geom_density2d()

ggplot(train,aes(x=IBU,y=ABV,colour=Class1)) + geom_jitter() +geom_density2d() + ggtitle("Density Plot of ABV vs IBU")

  1. Knock their socks off! Find one other useful inference from the data that you feel Budweiser may be able to find value in. You must convince them why it is important and back up your conviction with appropriate statistical evidence.
fit.IBU <- aov(IBU~Style, data = Brewery_Beer)
print("ANOVA IBU")
## [1] "ANOVA IBU"
summary(fit.IBU)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## Style         90 707451    7861   43.34 <2e-16 ***
## Residuals   1314 238303     181                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1005 observations deleted due to missingness
fit.ABV <- aov(ABV~Style, data = Brewery_Beer)
print("ANOVA ABV")
## [1] "ANOVA ABV"
summary(fit.ABV)
##               Df Sum Sq  Mean Sq F value Pr(>F)    
## Style         99 0.2703 2.73e-03   38.35 <2e-16 ***
## Residuals   2248 0.1601 7.12e-05                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 62 observations deleted due to missingness
#There is significant evidence to prove that at least one Style has a different IBU, and one style has a different ABV (p-value < 2e-16).As a result, one can infer that different styles are characterized by different combinations of bitterness and alcohol level.
Brewery_Beer %>% ggplot(aes(x = IBU, y = ABV, color = Style)) + geom_point() + theme(legend.position = "None") + ggtitle("ABV vs IBU by Style")
## Warning: Removed 1005 rows containing missing values (geom_point).